In [3]:
import pandas as pd
import numpy as np
import os
import glob
import nltk.data
import nltk, re, pprint
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from collections import Counter
import sqlite3


def connect_db():
    return sqlite3.connect('/Users/sheldon/podcasts/test.db')

def create_df_object():
    # pull every transcribed episode out of SQLite into a DataFrame
    conn = connect_db()
    df = pd.read_sql("select * from podcast", conn)
    return df

df = create_df_object()

# NLTK's English stop word list, as a set for O(1) membership tests
stop = set(stopwords.words('english'))

In [6]:
# the same episodes also live in Postgres; read them through SQLAlchemy
import psycopg2
import sys
from sqlalchemy import create_engine

engine = create_engine('postgresql://sheldon@localhost:5432/sheldon')
df1 = pd.read_sql("select * from podcasts", engine)

In [7]:
# DataFrame.query() parses its argument as a boolean expression,
# not as SQL, so "select *" is rejected outright:
df1.query("select *")


  File "<unknown>", line 1
    select *
            ^
SyntaxError: invalid syntax
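A working `query()` call filters rows with a boolean expression instead; a sketch, where the `series` column name is taken from the merges later in this notebook and the value is a placeholder:

In [ ]:
# filter with a boolean expression; full SQL belongs in pd.read_sql
df1.query("series == 'somepodcast'")  # 'somepodcast' is a hypothetical value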

In [3]:
def remove_stop_words(row):
    # tokenize the transcript, then drop stop words and contraction fragments
    tokens = word_tokenize(str(row))
    tokens = [w for w in tokens if w not in stop]
    tokens = [word for word in tokens if "'" not in word]
    return ' '.join(tokens)

df['transcribed'] = df['transcribed'].apply(remove_stop_words)
texts = df.transcribed.tolist()

# count how often each word appears across all transcripts
# (split each text into words; iterating the string itself would count characters)
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text.split():
        frequency[token] += 1
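The same counts can be built in one pass with `collections.Counter`, which is already imported above; a minimal equivalent sketch:

In [ ]:
from collections import Counter

# word frequencies across every cleaned transcript
frequency = Counter(token for text in texts for token in text.split())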

In [4]:
from gensim import corpora, models, similarities
import gensim

In [4]:
'''class MyCorpus(object):
    def __iter__(self):
        for doc in docs:
            yield dictionary.doc2bow(doc.split())
corpus_mem_friendly = MyCorpus()
corpora.MmCorpus.serialize('corpus.mm',corpus_mem_friendly)
dictionary.save('words.dict')
df["review_text"] = df["transcribed"].map(lambda x: x.split(' '))
from gensim import corpora
dictionary = corpora.Dictionary(df["review_text"])
'''


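The commented-out cell above uses `dictionary` and `docs` before they exist. A runnable ordering of the same steps might look like this (a sketch, assuming `df['transcribed']` holds the cleaned transcripts):

In [ ]:
from gensim import corpora

# tokenize once, build the id<->word mapping, then stream bag-of-words
# vectors to disk so the full corpus never has to sit in memory
df["review_text"] = df["transcribed"].map(lambda x: x.split(' '))
dictionary = corpora.Dictionary(df["review_text"])
dictionary.save('words.dict')

class MyCorpus(object):
    def __iter__(self):
        for doc in df["review_text"]:
            yield dictionary.doc2bow(doc)

corpora.MmCorpus.serialize('corpus.mm', MyCorpus())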

In [9]:
# load all the previously saved models
dictionary = corpora.Dictionary.load('models/words.dict')
corpus = corpora.MmCorpus.load('models/corpus.mm')
tfidf = gensim.models.tfidfmodel.TfidfModel.load('models/tfidf_model')
lsi = gensim.models.lsimodel.LsiModel.load('models/model.lsi')
index = similarities.MatrixSimilarity.load('models/corpus.index')
#tfidf.save('tfidf_model')
#lsi.save('models/model.lsi')
#tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
#lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=75)
corpus_lsi = lsi[corpus_tfidf]


---------------------------------------------------------------------------
UnpicklingError                           Traceback (most recent call last)
<ipython-input-9-2343d514bd53> in <module>()
      1 #load all the stuff
      2 dictionary = corpora.Dictionary.load('models/words.dict')
----> 3 corpus = corpora.MmCorpus.load('models/corpus.mm')
      4 tfidf = gensim.models.tfidfmodel.TfidfModel.load('models/tfidf_model')
      5 lsi = gensim.models.lsimodel.LsiModel.load('models/model.lsi')

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/site-packages/gensim/utils.pyc in load(cls, fname, mmap)
    246         compress, subname = SaveLoad._adapt_by_suffix(fname)
    247 
--> 248         obj = unpickle(fname)
    249         obj._load_specials(fname, mmap, compress, subname)
    250         return obj

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/site-packages/gensim/utils.pyc in unpickle(fname)
    909     with smart_open(fname) as f:
    910         # Because of loading from S3 load can't be used (missing readline in smart_open)
--> 911         return _pickle.loads(f.read())
    912 
    913 

UnpicklingError: invalid load key, '%'.
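The UnpicklingError comes from line 3: gensim's `.load()` unpickles, but `corpus.mm` is a plain-text Matrix Market file whose header starts with `%`. The `MmCorpus` constructor reads that format directly:

In [ ]:
# .load() is only for objects written with gensim's .save() (pickled);
# Matrix Market files go through the constructor instead
corpus = corpora.MmCorpus('models/corpus.mm')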


In [11]:
def getKey(item):
    return item[1]

# the 100 strongest words for topic 8, sorted by LSI weight
sorted(lsi.show_topic(8, topn=100), key=getKey, reverse=True)


Out[11]:
[(u'economics', 0.20075241030479216),
 (u'economists', 0.096918993194952341),
 (u'gender', 0.086645740787323997),
 (u'offender', 0.080986352314974794),
 (u'coin', 0.068897077342306143),
 (u'offenders', 0.068654664387510944),
 (u'marijuana', 0.068175380206256297),
 (u'apron', 0.066312516353626047),
 (u'financial', 0.062462980433143037),
 (u'education', 0.061449591429715587),
 (u'police', 0.061018826813492695),
 (u'petty', 0.059712896718308241),
 (u'preferences', 0.056998621035908949),
 (u'assaulting', 0.055874036704667569),
 (u'charities', 0.054803188035918111),
 (u'kidney', 0.0544317951725444),
 (u'officer', 0.052501476182249304),
 (u'caleb', 0.052433083247781884),
 (u'diploma', 0.050797805357103633),
 (u'registry', 0.050217514569610322),
 (u'giants', 0.050112168659068236),
 (u'driver', 0.048065446698679702),
 (u'reaganomics', 0.046414306760494413),
 (u'game', 0.045799533122943097),
 (u'economist', 0.045648828501652632),
 (u'crime', 0.044992693069287437),
 (u'diplomas', 0.044448079687465711),
 (u'radio', 0.044157869116420258),
 (u'currency', 0.043632198572000444),
 (u'cowboys', 0.043251296345787796),
 (u'utopia', 0.042731704755242149),
 (u'tommy', 0.042726015553238447),
 (u'steve', 0.042248189766779923),
 (u'alcohol', 0.042056066901713353),
 (u'games', 0.041866191450996126),
 (u'trophy', 0.041702718345644214),
 (u'carolina', 0.041576761668042375),
 (u'preference', 0.041406695996573532),
 (u'packers', 0.040655820422947908),
 (u'denver', 0.040584694736775735),
 (u'freak', 0.040476103140352888),
 (u'trophies', 0.0389693134498569),
 (u'thump', 0.038741618715308276),
 (u'bono', 0.038338917796152283),
 (u'firstborn', 0.037978874195838147),
 (u'hargreaves', 0.03764094449513268),
 (u'gift', 0.037495529218878103),
 (u'restrooms', 0.037270876644085002),
 (u'mortified', 0.036611151467359319),
 (u'pittsburgh', 0.036132237090812522),
 (u'chicago', 0.036118557649630158),
 (u'mortgages', 0.03608682634714637),
 (u'pornography', 0.036065406294734731),
 (u'morgan', 0.035849356936325023),
 (u'blog', 0.035848506091984936),
 (u'sex', 0.035681433053336195),
 (u'sister', 0.035546112638106628),
 (u'crimes', 0.035543037065394176),
 (u'minus', 0.035494555582051744),
 (u'tribune', 0.034926183742266491),
 (u'criminal', 0.034920818902150597),
 (u'charges', 0.03489386369348807),
 (u'markets', 0.034731668978308303),
 (u'jets', 0.033874664078435396),
 (u'degree', 0.033866707790161196),
 (u'dean', 0.033815812714591047),
 (u'stanley', 0.03370301132013459),
 (u'minnesota', 0.03333750931794318),
 (u'paddy', 0.033144301705865811),
 (u'sesame', 0.032815701742337736),
 (u'zoom', 0.032460020806240768),
 (u'embarking', -0.032504793891144711),
 (u'donald', -0.033636813452894584),
 (u'improving', -0.033735014880266589),
 (u'shane', -0.034054833765613128),
 (u'ladies', -0.03477144701118861),
 (u'genetic', -0.035370858950304208),
 (u'patient', -0.035657589553726811),
 (u'medication', -0.036161178193132575),
 (u'hype', -0.03704937890596121),
 (u'asia', -0.037416770020017948),
 (u'ha', -0.037651017605805283),
 (u'polls', -0.037791308688583568),
 (u'franklin', -0.037857829243672328),
 (u'diabetes', -0.04025621090744548),
 (u'black', -0.040731944805092277),
 (u'republican', -0.040976605810292319),
 (u'trump', -0.041859603230542157),
 (u'queen', -0.043730798342776406),
 (u'shame', -0.044976328399778727),
 (u'hap', -0.045146926700645996),
 (u'movies', -0.051521539418352212),
 (u'cream', -0.051756938493142189),
 (u'swiss', -0.056873965391394714),
 (u'philadelphia', -0.062535190303895261),
 (u'movie', -0.065527781760655185),
 (u'health', -0.065892805668891316),
 (u'patients', -0.097967991649763622),
 (u'vulnerability', -0.10969328881746618),
 (u'queens', -0.11206711629168782)]
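gensim can also render a topic as a single weighted-term string, which is easier to scan than the raw tuples:

In [ ]:
# compact summary of topic 8, the topic inspected above
print(lsi.print_topic(8, topn=10))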

In [246]:
top_topics


Out[246]:
[(8, 0.17896380730880518),
 (21, 0.091766397149891432),
 (11, 0.065616352648861648),
 (13, 0.064971534129887182),
 (18, 0.025042972421945916),
 (14, 0.024075613086501517),
 (6, 0.015937881772443523),
 (3, 0.01362197571894958),
 (5, 0.013074554462267355),
 (4, 0.0088382479607463076),
 (9, 0.0022015115732176059),
 (19, -0.027478503662382983),
 (1, -0.033732093283322856),
 (2, -0.03667141346621143),
 (17, -0.043455623803401247),
 (7, -0.043792188330803443),
 (15, -0.045263199178342921),
 (16, -0.045927792542153609),
 (12, -0.07568664639746138),
 (22, -0.081326541458310503),
 (24, -0.10229900073505233),
 (0, -0.11914869474184712),
 (10, -0.13419026138406509),
 (23, -0.20864550487239134),
 (20, -0.23951662981995908)]

In [216]:
corpus_lsi[1]


Out[216]:
[(0, -0.30666008595379457),
 (1, -0.11729049248029232),
 (2, 0.09465178780371751),
 (3, 0.1021867895086989),
 (4, -0.055071455362356046),
 (5, 0.042427461451872366),
 (6, -0.099190452301410284),
 (7, -0.15977382403279178),
 (8, -0.19050174225760189),
 (9, 0.11962812051264791),
 (10, -0.062988823786490372),
 (11, 0.15656530181930398),
 (12, -0.088286265017136933),
 (13, -0.068219332021130799),
 (14, 0.074681883889618314),
 (15, -0.010344141539310351),
 (16, -0.011351173648180354),
 (17, 0.045443599720397507),
 (18, 0.011356358861277709),
 (19, 0.020344898048179155),
 (20, 0.016910421609358444),
 (21, -0.007355589844098882),
 (22, -0.032113079486917967),
 (23, -0.047263684081847689),
 (24, 0.0025554615751745072)]

In [9]:
def get_related_podcasts(index):
    def getKey(item):
        return item[1]
    # ten strongest LSI topics for this podcast
    corpus = corpus_lsi[index]
    corpus = sorted(corpus, key=getKey, reverse=True)[:10]
    related_df = pd.DataFrame(corpus, columns=['index', 'score'])
    final_df = pd.merge(related_df, df, on='index')[['index', 'episode', 'score', 'series']]
    return final_df

related_podcasts = list(get_related_podcasts(1)['index'])

def get_topics_per_podcast(podcast_index):
    # keep only topics this podcast loads on with weight > 0.10
    topic_ids = [i for i in sorted(corpus_lsi[podcast_index], key=getKey, reverse=True) if i[1] > 0.10]
    def get_topic_arrays(topic_ids):
        x = []
        for id in topic_ids:
            # the five strongest words per topic, keeping weights above .05
            list_of_words = sorted(lsi.show_topic(id[0], topn=5), key=getKey, reverse=True)
            z = []
            for word in list_of_words:
                if word[1] > .05:
                    z.append(word)
            x.append(z)
        return x
    return get_topic_arrays(topic_ids)

testing = [[idx, get_topics_per_podcast(idx)] for idx in related_podcasts]
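The `getKey` helper re-implements `operator.itemgetter`; the standard-library version avoids redefining the same one-liner in every cell:

In [ ]:
from operator import itemgetter

# ten strongest (topic_id, weight) pairs for podcast 1, highest first
sorted(corpus_lsi[1], key=itemgetter(1), reverse=True)[:10]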

In [12]:
x = pd.DataFrame(testing, columns=['index', 'words'])
x.words.iloc[0]


Out[12]:
[[(u'tommy', 0.13349055465001164),
  (u'petrified', 0.13210484035640158),
  (u'elkins', 0.12485860653687339),
  (u'trump', 0.11205463919933509)],
 [(u'lakeview', 0.16230038683391426), (u'chandler', 0.13397388194476367)],
 [(u'dean', 0.15605057080987761), (u'police', 0.097709114659441917)],
 [(u'movie', 0.19523593887426033),
  (u'movies', 0.13411589375868901),
  (u'assaulting', 0.092371566597563279)]]

In [150]:
def get_related_podcasts(query):
    def getKey(item):
        return item[1]
    # project the query into LSI space and rank every episode against it
    vec_box = dictionary.doc2bow(query.split())
    vec_lsi = lsi[vec_box]
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])[:10]
    related_df = pd.DataFrame(sims, columns=['index', 'score'])

    def get_related_podcasts_list(index):
        corpus = corpus_lsi[index]
        corpus = sorted(corpus, key=getKey, reverse=True)[:10]
        related_df = pd.DataFrame(corpus, columns=['index', 'score'])
        final_df = pd.merge(related_df, df, on='index')[['index', 'episode', 'score', 'series']]
        return final_df

    # note: topic words are pulled for podcast 1 here, not for the query's top hit
    related_podcasts = list(get_related_podcasts_list(1)['index'])

    def get_topics_per_podcast(podcast_index):
        # keep only topics with weight > 0.10, then each topic's > .05 words
        topic_ids = [i for i in sorted(corpus_lsi[podcast_index], key=getKey, reverse=True) if i[1] > 0.10]
        def get_topic_arrays(topic_ids):
            x = []
            for id in topic_ids:
                list_of_words = sorted(lsi.show_topic(id[0], topn=5), key=getKey, reverse=True)
                z = []
                for word in list_of_words:
                    if word[1] > .05:
                        z.append(word)
                x.append(z)
            return x
        return get_topic_arrays(topic_ids)

    topics_per_podcast = [[idx, get_topics_per_podcast(idx)] for idx in related_podcasts]
    other_df = pd.DataFrame(topics_per_podcast, columns=['topic_index', 'words'])
    final_df = pd.merge(related_df, df)
    test_final_df = pd.merge(other_df, final_df, left_index=True, right_index=True)[['words', 'index', 'score', 'episode', 'series']]
    return test_final_df

In [161]:
x = get_related_podcasts('cats')
zz = x.words.iloc[0]

In [172]:
zz[1]


Out[172]:
[(u'lakeview', 0.16230038683391426), (u'chandler', 0.13397388194476367)]

In [151]:
test


Out[151]:
id rank
0 20 0.291095
1 29 0.225385
2 23 0.218698
3 30 0.197811
4 22 0.185274
5 17 0.174860
6 24 0.160806
7 34 0.159709
8 26 0.139874
9 28 0.128635
10 25 0.124274
11 21 0.119610
12 27 0.111168
13 32 0.108407
14 18 0.091393
15 31 0.054864
16 89 0.023582
17 33 0.022864
18 78 0.008958
19 19 -0.003617
20 62 -0.021017
21 119 -0.025039
22 74 -0.027536
23 121 -0.031408
24 137 -0.037055
25 68 -0.037749
26 115 -0.037889
27 122 -0.039599
28 99 -0.042506
29 65 -0.048880
... ... ...
121 47 -0.194450
122 0 -0.195210
123 3 -0.197204
124 107 -0.197657
125 45 -0.201181
126 12 -0.201359
127 8 -0.201887
128 101 -0.203043
129 149 -0.204315
130 112 -0.204922
131 54 -0.205141
132 50 -0.205235
133 150 -0.205884
134 147 -0.208164
135 1 -0.208597
136 53 -0.209838
137 9 -0.210375
138 52 -0.210404
139 108 -0.212018
140 104 -0.214856
141 148 -0.216398
142 102 -0.216993
143 7 -0.221978
144 5 -0.222895
145 109 -0.222942
146 86 -0.225827
147 105 -0.230408
148 106 -0.231272
149 111 -0.236596
150 103 -0.241512

151 rows × 2 columns


In [146]:
# TF-IDF over the cleaned transcripts, then all-pairs cosine similarity
tf = TfidfVectorizer(stop_words=stop)
tfidf_matrix = tf.fit_transform(df['transcribed'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
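With the square similarity matrix in hand, the most similar episodes to any one episode can be read straight off its row. A sketch, assuming `df` keeps its default integer index; position 0 of the sorted row is the episode itself, so it is skipped:

In [ ]:
# the ten transcripts most similar to episode 0
episode = 0
related = cosine_similarities[episode].argsort()[::-1][1:11]
df.loc[related, ['episode', 'series']]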

In [130]:
# hack: refit the vectorizer on the query itself, then score each transcript
# by how much of the query vocabulary it contains
query = 'python economics love'
trans_query = query.lower().split()
tfidf_matrix_test = tf.fit_transform(trans_query)
tfidf_matrix_train = tf.transform(df['transcribed'])
query_similarities = linear_kernel(tfidf_matrix_test, tfidf_matrix_train)
query_similarities = query_similarities.argsort()[0][::-1]
# map rank -> podcast index, then flip it into a DataFrame indexed by podcast
pod_dict = dict(zip(range(0, len(query_similarities)), query_similarities))
pod_dict = pd.DataFrame({'rank': pod_dict.keys()}, index=pod_dict.values())
#related_podcasts_df = pd.DataFrame.join(pod_dict, df, how='inner')
#final_df = related_podcasts_df.sort_values('rank')[1:11][['rank','episode','series']]
#related_podcasts = final_df['episode']
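Refitting `tf` on the query throws away the corpus vocabulary learned in In [146]. An alternative sketch keeps that fitted vectorizer and projects the query into the corpus space instead (assuming `tf` has not been refit in between, and `df` keeps its default integer index):

In [ ]:
# score every transcript against the query in the corpus' own TF-IDF space
query_vec = tf.transform(['python economics love'])
scores = linear_kernel(query_vec, tfidf_matrix)[0]
top_ten = scores.argsort()[::-1][:10]
df.loc[top_ten, ['episode', 'series']]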

In [131]:
pod_dict


Out[131]:
rank
148 0
113 1
150 2
45 3
52 4
51 5
50 6
49 7
48 8
47 9
46 10
44 11
54 12
43 13
42 14
41 15
40 16
39 17
38 18
53 19
55 20
36 21
56 22
71 23
70 24
69 25
68 26
67 27
66 28
65 29
... ...
86 121
85 122
84 123
83 124
82 125
81 126
80 127
79 128
78 129
77 130
76 131
91 132
92 133
93 134
102 135
108 136
107 137
106 138
105 139
104 140
103 141
101 142
94 143
100 144
99 145
98 146
97 147
96 148
95 149
0 150

151 rows × 1 columns


In [ ]: